{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 离散值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings  # used below to silence warnings and keep cell output short\n",
    "warnings.filterwarnings('ignore')  # NOTE: this hides every warning category, deprecation notices included"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Platform</th>\n",
       "      <th>Year</th>\n",
       "      <th>Genre</th>\n",
       "      <th>Publisher</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>NES</td>\n",
       "      <td>1985.0</td>\n",
       "      <td>Platform</td>\n",
       "      <td>Nintendo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>Wii</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>Racing</td>\n",
       "      <td>Nintendo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>Wii</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Nintendo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>GB</td>\n",
       "      <td>1996.0</td>\n",
       "      <td>Role-Playing</td>\n",
       "      <td>Nintendo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Tetris</td>\n",
       "      <td>GB</td>\n",
       "      <td>1989.0</td>\n",
       "      <td>Puzzle</td>\n",
       "      <td>Nintendo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>New Super Mario Bros.</td>\n",
       "      <td>DS</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>Platform</td>\n",
       "      <td>Nintendo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name Platform    Year         Genre Publisher\n",
       "1         Super Mario Bros.      NES  1985.0      Platform  Nintendo\n",
       "2            Mario Kart Wii      Wii  2008.0        Racing  Nintendo\n",
       "3         Wii Sports Resort      Wii  2009.0        Sports  Nintendo\n",
       "4  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing  Nintendo\n",
       "5                    Tetris       GB  1989.0        Puzzle  Nintendo\n",
       "6     New Super Mario Bros.       DS  2006.0      Platform  Nintendo"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the video game sales table and preview a handful of descriptive columns.\n",
    "vg_df = pd.read_csv('data/vgsales.csv', encoding='ISO-8859-1')\n",
    "preview_cols = ['Name', 'Platform', 'Year', 'Genre', 'Publisher']\n",
    "vg_df[preview_cols].iloc[1:7]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "机器无法识别字符串类型数据，需要做处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n",
       "       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n",
       "       'Strategy'], dtype=object)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the distinct genre strings (np.unique returns them sorted).\n",
    "genres = np.unique(vg_df['Genre'].values)\n",
    "genres  # only a small number of distinct values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: 'Action',\n",
       " 1: 'Adventure',\n",
       " 2: 'Fighting',\n",
       " 3: 'Misc',\n",
       " 4: 'Platform',\n",
       " 5: 'Puzzle',\n",
       " 6: 'Racing',\n",
       " 7: 'Role-Playing',\n",
       " 8: 'Shooter',\n",
       " 9: 'Simulation',\n",
       " 10: 'Sports',\n",
       " 11: 'Strategy'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Fit the encoder on the genre column and get one integer code per row.\n",
    "gle = LabelEncoder()\n",
    "genre_labels = gle.fit_transform(vg_df['Genre'])\n",
    "\n",
    "# classes_ is ordered by code, so enumerate() yields the code -> genre lookup.\n",
    "genre_mappings = dict(enumerate(gle.classes_))\n",
    "genre_mappings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Platform</th>\n",
       "      <th>Year</th>\n",
       "      <th>Genre</th>\n",
       "      <th>GenreLabel</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>NES</td>\n",
       "      <td>1985.0</td>\n",
       "      <td>Platform</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>Wii</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>Racing</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>Wii</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>Sports</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>GB</td>\n",
       "      <td>1996.0</td>\n",
       "      <td>Role-Playing</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Tetris</td>\n",
       "      <td>GB</td>\n",
       "      <td>1989.0</td>\n",
       "      <td>Puzzle</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>New Super Mario Bros.</td>\n",
       "      <td>DS</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>Platform</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name Platform    Year         Genre  GenreLabel\n",
       "1         Super Mario Bros.      NES  1985.0      Platform           4\n",
       "2            Mario Kart Wii      Wii  2008.0        Racing           6\n",
       "3         Wii Sports Resort      Wii  2009.0        Sports          10\n",
       "4  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing           7\n",
       "5                    Tetris       GB  1989.0        Puzzle           5\n",
       "6     New Super Mario Bros.       DS  2006.0      Platform           4"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Store the integer codes next to the original column, then preview both.\n",
    "vg_df['GenreLabel'] = genre_labels\n",
    "label_preview_cols = ['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']\n",
    "vg_df[label_preview_cols].iloc[1:7]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Map\n",
    "自己建一个字典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Action': 0,\n",
       " 'Adventure': 1,\n",
       " 'Fighting': 2,\n",
       " 'Misc': 3,\n",
       " 'Platform': 4,\n",
       " 'Puzzle': 5,\n",
       " 'Racing': 6,\n",
       " 'Role-Playing': 7,\n",
       " 'Shooter': 8,\n",
       " 'Simulation': 9,\n",
       " 'Sports': 10,\n",
       " 'Strategy': 11}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Hand-built genre -> integer code dictionary, numbered in classes_ order.\n",
    "gen_ord_map = dict(zip(gle.classes_, range(len(gle.classes_))))\n",
    "gen_ord_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Genre</th>\n",
       "      <th>GenreLabel</th>\n",
       "      <th>GenreMap</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>Platform</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>Racing</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>Sports</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>Role-Playing</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Tetris</td>\n",
       "      <td>Puzzle</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>New Super Mario Bros.</td>\n",
       "      <td>Platform</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name         Genre  GenreLabel  GenreMap\n",
       "1         Super Mario Bros.      Platform           4         4\n",
       "2            Mario Kart Wii        Racing           6         6\n",
       "3         Wii Sports Resort        Sports          10        10\n",
       "4  Pokemon Red/Pokemon Blue  Role-Playing           7         7\n",
       "5                    Tetris        Puzzle           5         5\n",
       "6     New Super Mario Bros.      Platform           4         4"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Apply the dictionary to every genre value; the preview shows it matches GenreLabel.\n",
    "vg_df['GenreMap'] = vg_df['Genre'].map(gen_ord_map)\n",
    "map_preview_cols = ['Name', 'Genre', 'GenreLabel', 'GenreMap']\n",
    "vg_df[map_preview_cols].iloc[1:7]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## One-Hot Encoder\n",
    "对于离散型特征，基于树的模型（例如随机森林）通常不需要one-hot编码；而基于距离或对特征做加权求和的模型（例如KNN、SVM、线性模型、神经网络）会把整数编码误当作有大小顺序的数值，因此一般需要使用one-hot编码。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., ..., 0., 1., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       ...,\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "# One-hot encode the integer genre codes: one 0/1 column per category,\n",
    "# with 1 marking the row's category and 0 everywhere else.\n",
    "gen_ohe = OneHotEncoder()\n",
    "genre_code_frame = vg_df[['GenreLabel']]  # 2-D input: a single-column frame\n",
    "gen_feature_sparse = gen_ohe.fit_transform(genre_code_frame)\n",
    "gen_feature_arr = gen_feature_sparse.toarray()  # convert the encoder output to a dense array\n",
    "gen_feature_arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Action</th>\n",
       "      <th>Adventure</th>\n",
       "      <th>Fighting</th>\n",
       "      <th>Misc</th>\n",
       "      <th>Platform</th>\n",
       "      <th>Puzzle</th>\n",
       "      <th>Racing</th>\n",
       "      <th>Role-Playing</th>\n",
       "      <th>Shooter</th>\n",
       "      <th>Simulation</th>\n",
       "      <th>Sports</th>\n",
       "      <th>Strategy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Action  Adventure  Fighting  Misc  Platform  Puzzle  Racing  Role-Playing  \\\n",
       "0     0.0        0.0       0.0   0.0       0.0     0.0     0.0           0.0   \n",
       "1     0.0        0.0       0.0   0.0       1.0     0.0     0.0           0.0   \n",
       "2     0.0        0.0       0.0   0.0       0.0     0.0     1.0           0.0   \n",
       "3     0.0        0.0       0.0   0.0       0.0     0.0     0.0           0.0   \n",
       "4     0.0        0.0       0.0   0.0       0.0     0.0     0.0           1.0   \n",
       "\n",
       "   Shooter  Simulation  Sports  Strategy  \n",
       "0      0.0         0.0     1.0       0.0  \n",
       "1      0.0         0.0     0.0       0.0  \n",
       "2      0.0         0.0     0.0       0.0  \n",
       "3      0.0         0.0     1.0       0.0  \n",
       "4      0.0         0.0     0.0       0.0  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Name the one-hot columns from the label encoder's own classes_, so column i is\n",
    "# the genre whose integer code is i (rather than relying on a separate np.unique\n",
    "# call happening to produce the same order).\n",
    "genres = gle.classes_\n",
    "# Reuse vg_df's index so the later column-wise concat lines rows up one-to-one.\n",
    "gen_features = pd.DataFrame(gen_feature_arr, columns=genres, index=vg_df.index)\n",
    "gen_features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Genre</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>Sports</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>Platform</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>Racing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>Sports</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>Role-Playing</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name         Genre\n",
       "0                Wii Sports        Sports\n",
       "1         Super Mario Bros.      Platform\n",
       "2            Mario Kart Wii        Racing\n",
       "3         Wii Sports Resort        Sports\n",
       "4  Pokemon Red/Pokemon Blue  Role-Playing"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep just two of the original columns so the merged result is easy to read;\n",
    "# a real pipeline would merge the encoded columns with the whole table.\n",
    "vg_df_2 = vg_df.loc[:, ['Name', 'Genre']]\n",
    "vg_df_2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Genre</th>\n",
       "      <th>Action</th>\n",
       "      <th>Adventure</th>\n",
       "      <th>Fighting</th>\n",
       "      <th>Misc</th>\n",
       "      <th>Platform</th>\n",
       "      <th>Puzzle</th>\n",
       "      <th>Racing</th>\n",
       "      <th>Role-Playing</th>\n",
       "      <th>Shooter</th>\n",
       "      <th>Simulation</th>\n",
       "      <th>Sports</th>\n",
       "      <th>Strategy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>Sports</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>Platform</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>Racing</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>Sports</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>Role-Playing</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name         Genre  Action  Adventure  Fighting  Misc  \\\n",
       "0                Wii Sports        Sports     0.0        0.0       0.0   0.0   \n",
       "1         Super Mario Bros.      Platform     0.0        0.0       0.0   0.0   \n",
       "2            Mario Kart Wii        Racing     0.0        0.0       0.0   0.0   \n",
       "3         Wii Sports Resort        Sports     0.0        0.0       0.0   0.0   \n",
       "4  Pokemon Red/Pokemon Blue  Role-Playing     0.0        0.0       0.0   0.0   \n",
       "\n",
       "   Platform  Puzzle  Racing  Role-Playing  Shooter  Simulation  Sports  \\\n",
       "0       0.0     0.0     0.0           0.0      0.0         0.0     1.0   \n",
       "1       1.0     0.0     0.0           0.0      0.0         0.0     0.0   \n",
       "2       0.0     0.0     1.0           0.0      0.0         0.0     0.0   \n",
       "3       0.0     0.0     0.0           0.0      0.0         0.0     1.0   \n",
       "4       0.0     0.0     0.0           1.0      0.0         0.0     0.0   \n",
       "\n",
       "   Strategy  \n",
       "0       0.0  \n",
       "1       0.0  \n",
       "2       0.0  \n",
       "3       0.0  \n",
       "4       0.0  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Place the one-hot columns to the right of the original columns.\n",
    "vg_df_ohe = pd.concat([vg_df_2, gen_features], axis='columns')\n",
    "vg_df_ohe.head()  # e.g. the second row has Platform == 1, matching its Genre value 'Platform'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get Dummy\n",
    "更加实用的onehot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(16598, 13)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Genre</th>\n",
       "      <th>Adventure</th>\n",
       "      <th>Fighting</th>\n",
       "      <th>Misc</th>\n",
       "      <th>Platform</th>\n",
       "      <th>Puzzle</th>\n",
       "      <th>Racing</th>\n",
       "      <th>Role-Playing</th>\n",
       "      <th>Shooter</th>\n",
       "      <th>Simulation</th>\n",
       "      <th>Sports</th>\n",
       "      <th>Strategy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>Sports</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>Platform</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>Racing</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>Sports</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>Role-Playing</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name         Genre  Adventure  Fighting  Misc  \\\n",
       "0                Wii Sports        Sports          0         0     0   \n",
       "1         Super Mario Bros.      Platform          0         0     0   \n",
       "2            Mario Kart Wii        Racing          0         0     0   \n",
       "3         Wii Sports Resort        Sports          0         0     0   \n",
       "4  Pokemon Red/Pokemon Blue  Role-Playing          0         0     0   \n",
       "\n",
       "   Platform  Puzzle  Racing  Role-Playing  Shooter  Simulation  Sports  \\\n",
       "0         0       0       0             0        0           0       1   \n",
       "1         1       0       0             0        0           0       0   \n",
       "2         0       0       1             0        0           0       0   \n",
       "3         0       0       0             0        0           0       1   \n",
       "4         0       0       0             1        0           0       0   \n",
       "\n",
       "   Strategy  \n",
       "0         0  \n",
       "1         0  \n",
       "2         0  \n",
       "3         0  \n",
       "4         0  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop_first=True drops the indicator of the first category ('Action' here), leaving\n",
    "# k-1 columns; a row whose indicators are all 0 therefore belongs to that dropped category.\n",
    "gen_dummy_features = pd.get_dummies(vg_df['Genre'], drop_first=True)\n",
    "dummy_df = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n",
    "print(dummy_df.shape)\n",
    "dummy_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可以看到两句话就解决了我们上面那一长串"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(16598, 14)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Genre</th>\n",
       "      <th>Action</th>\n",
       "      <th>Adventure</th>\n",
       "      <th>Fighting</th>\n",
       "      <th>Misc</th>\n",
       "      <th>Platform</th>\n",
       "      <th>Puzzle</th>\n",
       "      <th>Racing</th>\n",
       "      <th>Role-Playing</th>\n",
       "      <th>Shooter</th>\n",
       "      <th>Simulation</th>\n",
       "      <th>Sports</th>\n",
       "      <th>Strategy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>Sports</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>Platform</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>Racing</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>Sports</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>Role-Playing</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name         Genre  Action  Adventure  Fighting  Misc  \\\n",
       "0                Wii Sports        Sports       0          0         0     0   \n",
       "1         Super Mario Bros.      Platform       0          0         0     0   \n",
       "2            Mario Kart Wii        Racing       0          0         0     0   \n",
       "3         Wii Sports Resort        Sports       0          0         0     0   \n",
       "4  Pokemon Red/Pokemon Blue  Role-Playing       0          0         0     0   \n",
       "\n",
       "   Platform  Puzzle  Racing  Role-Playing  Shooter  Simulation  Sports  \\\n",
       "0         0       0       0             0        0           0       1   \n",
       "1         1       0       0             0        0           0       0   \n",
       "2         0       0       1             0        0           0       0   \n",
       "3         0       0       0             0        0           0       1   \n",
       "4         0       0       0             1        0           0       0   \n",
       "\n",
       "   Strategy  \n",
       "0         0  \n",
       "1         0  \n",
       "2         0  \n",
       "3         0  \n",
       "4         0  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gen_dummy_features = pd.get_dummies(vg_df['Genre'])  # 和上面相比少了drop_first=True，一般用这种\n",
    "dummy_df_true = pd.concat([vg_df[['Name', 'Genre']], gen_dummy_features], axis=1)\n",
    "print(dummy_df_true.shape)\n",
    "dummy_df_true.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 二值特征化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>2006.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>1985.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>2008.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>2009.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>1996.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name    Year\n",
       "0                Wii Sports  2006.0\n",
       "1         Super Mario Bros.  1985.0\n",
       "2            Mario Kart Wii  2008.0\n",
       "3         Wii Sports Resort  2009.0\n",
       "4  Pokemon Red/Pokemon Blue  1996.0"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vg_year_df = vg_df[['Name', 'Year']]\n",
    "vg_year_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们把2000年以上的归类为1，其它归类为0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Year</th>\n",
       "      <th>Year_tow</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>1985.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>1996.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name    Year  Year_tow\n",
       "0                Wii Sports  2006.0         1\n",
       "1         Super Mario Bros.  1985.0         0\n",
       "2            Mario Kart Wii  2008.0         1\n",
       "3         Wii Sports Resort  2009.0         1\n",
       "4  Pokemon Red/Pokemon Blue  1996.0         0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vg_year_df['Year_tow'] = np.where(vg_year_df['Year'] >= 2000, 1, 0)\n",
    "vg_year_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Year</th>\n",
       "      <th>Year_tow</th>\n",
       "      <th>bn_year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>1985.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>1996.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name    Year  Year_tow  bn_year\n",
       "0                Wii Sports  2006.0         1      1.0\n",
       "1         Super Mario Bros.  1985.0         0      0.0\n",
       "2            Mario Kart Wii  2008.0         1      1.0\n",
       "3         Wii Sports Resort  2009.0         1      1.0\n",
       "4  Pokemon Red/Pokemon Blue  1996.0         0      0.0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import Binarizer\n",
    "# sklearn中的方法\n",
    "bn = Binarizer(threshold=2000)  # 大于2000我1，小于为0\n",
    "vg_year_df['Year']=vg_year_df['Year'].fillna(0)  # 数据中有Nan值，需要补0，否则无法二分\n",
    "bn_year = bn.transform([vg_year_df['Year']])[0]  # 获取转换的值，取第0列\n",
    "vg_year_df['bn_year'] = bn_year  # 插入数据\n",
    "vg_year_df.head()  # 结果与手动一致"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 多项式特征\n",
    "获得特征的更高维度和互相间关系的项。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NA_Sales</th>\n",
       "      <th>EU_Sales</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>41.49</td>\n",
       "      <td>29.02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>29.08</td>\n",
       "      <td>3.58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15.85</td>\n",
       "      <td>12.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15.75</td>\n",
       "      <td>11.01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11.27</td>\n",
       "      <td>8.89</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   NA_Sales  EU_Sales\n",
       "0     41.49     29.02\n",
       "1     29.08      3.58\n",
       "2     15.85     12.88\n",
       "3     15.75     11.01\n",
       "4     11.27      8.89"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "polynomial_df = vg_df[['NA_Sales', 'EU_Sales']]\n",
    "polynomial_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[4.1490000e+01, 2.9020000e+01, 1.7214201e+03, 1.2040398e+03,\n",
       "        8.4216040e+02],\n",
       "       [2.9080000e+01, 3.5800000e+00, 8.4564640e+02, 1.0410640e+02,\n",
       "        1.2816400e+01],\n",
       "       [1.5850000e+01, 1.2880000e+01, 2.5122250e+02, 2.0414800e+02,\n",
       "        1.6589440e+02],\n",
       "       ...,\n",
       "       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,\n",
       "        0.0000000e+00],\n",
       "       [0.0000000e+00, 1.0000000e-02, 0.0000000e+00, 0.0000000e+00,\n",
       "        1.0000000e-04],\n",
       "       [1.0000000e-02, 0.0000000e+00, 1.0000000e-04, 0.0000000e+00,\n",
       "        0.0000000e+00]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "\n",
    "# degree二次幂的复杂度\n",
    "pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)\n",
    "res = pf.fit_transform(polynomial_df)\n",
    "res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以第一行为例：\n",
    "<br>第一列和第二列分别表示原先的第一列和第二列\n",
    "<br>第三列和第五列表示第一列和第二列分别的平方，第四列表示两者的乘积"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NA_Sales</th>\n",
       "      <th>EU_Sales</th>\n",
       "      <th>NA_Sales^2</th>\n",
       "      <th>NA_Sales*EU_Sales</th>\n",
       "      <th>EU_Sales^2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>41.49</td>\n",
       "      <td>29.02</td>\n",
       "      <td>1721.4201</td>\n",
       "      <td>1204.0398</td>\n",
       "      <td>842.1604</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>29.08</td>\n",
       "      <td>3.58</td>\n",
       "      <td>845.6464</td>\n",
       "      <td>104.1064</td>\n",
       "      <td>12.8164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15.85</td>\n",
       "      <td>12.88</td>\n",
       "      <td>251.2225</td>\n",
       "      <td>204.1480</td>\n",
       "      <td>165.8944</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15.75</td>\n",
       "      <td>11.01</td>\n",
       "      <td>248.0625</td>\n",
       "      <td>173.4075</td>\n",
       "      <td>121.2201</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11.27</td>\n",
       "      <td>8.89</td>\n",
       "      <td>127.0129</td>\n",
       "      <td>100.1903</td>\n",
       "      <td>79.0321</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   NA_Sales  EU_Sales  NA_Sales^2  NA_Sales*EU_Sales  EU_Sales^2\n",
       "0     41.49     29.02   1721.4201          1204.0398    842.1604\n",
       "1     29.08      3.58    845.6464           104.1064     12.8164\n",
       "2     15.85     12.88    251.2225           204.1480    165.8944\n",
       "3     15.75     11.01    248.0625           173.4075    121.2201\n",
       "4     11.27      8.89    127.0129           100.1903     79.0321"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "intr_features = pd.DataFrame(res, columns=['NA_Sales',\n",
    "                                           'EU_Sales',\n",
    "                                           'NA_Sales^2',\n",
    "                                           'NA_Sales*EU_Sales',\n",
    "                                           'EU_Sales^2'])\n",
    "intr_features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>Name</th>\n",
       "      <th>Platform</th>\n",
       "      <th>Year</th>\n",
       "      <th>Genre</th>\n",
       "      <th>Publisher</th>\n",
       "      <th>NA_Sales</th>\n",
       "      <th>EU_Sales</th>\n",
       "      <th>JP_Sales</th>\n",
       "      <th>Other_Sales</th>\n",
       "      <th>Global_Sales</th>\n",
       "      <th>GenreLabel</th>\n",
       "      <th>GenreMap</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>Wii</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Nintendo</td>\n",
       "      <td>41.49</td>\n",
       "      <td>29.02</td>\n",
       "      <td>3.77</td>\n",
       "      <td>8.46</td>\n",
       "      <td>82.74</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>NES</td>\n",
       "      <td>1985.0</td>\n",
       "      <td>Platform</td>\n",
       "      <td>Nintendo</td>\n",
       "      <td>29.08</td>\n",
       "      <td>3.58</td>\n",
       "      <td>6.81</td>\n",
       "      <td>0.77</td>\n",
       "      <td>40.24</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>Wii</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>Racing</td>\n",
       "      <td>Nintendo</td>\n",
       "      <td>15.85</td>\n",
       "      <td>12.88</td>\n",
       "      <td>3.79</td>\n",
       "      <td>3.31</td>\n",
       "      <td>35.82</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>Wii</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Nintendo</td>\n",
       "      <td>15.75</td>\n",
       "      <td>11.01</td>\n",
       "      <td>3.28</td>\n",
       "      <td>2.96</td>\n",
       "      <td>33.00</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>GB</td>\n",
       "      <td>1996.0</td>\n",
       "      <td>Role-Playing</td>\n",
       "      <td>Nintendo</td>\n",
       "      <td>11.27</td>\n",
       "      <td>8.89</td>\n",
       "      <td>10.22</td>\n",
       "      <td>1.00</td>\n",
       "      <td>31.37</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Rank                      Name Platform    Year         Genre Publisher  \\\n",
       "0     1                Wii Sports      Wii  2006.0        Sports  Nintendo   \n",
       "1     2         Super Mario Bros.      NES  1985.0      Platform  Nintendo   \n",
       "2     3            Mario Kart Wii      Wii  2008.0        Racing  Nintendo   \n",
       "3     4         Wii Sports Resort      Wii  2009.0        Sports  Nintendo   \n",
       "4     5  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing  Nintendo   \n",
       "\n",
       "   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  GenreLabel  \\\n",
       "0     41.49     29.02      3.77         8.46         82.74          10   \n",
       "1     29.08      3.58      6.81         0.77         40.24           4   \n",
       "2     15.85     12.88      3.79         3.31         35.82           6   \n",
       "3     15.75     11.01      3.28         2.96         33.00          10   \n",
       "4     11.27      8.89     10.22         1.00         31.37           7   \n",
       "\n",
       "   GenreMap  \n",
       "0        10  \n",
       "1         4  \n",
       "2         6  \n",
       "3        10  \n",
       "4         7  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vg_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binning 特征\n",
    "一般用来处理年龄"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>2006.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>1985.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>2008.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>2009.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>1996.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name    Year\n",
       "0                Wii Sports  2006.0\n",
       "1         Super Mario Bros.  1985.0\n",
       "2            Mario Kart Wii  2008.0\n",
       "3         Wii Sports Resort  2009.0\n",
       "4  Pokemon Red/Pokemon Blue  1996.0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bin_df = vg_df[['Name','Year']]  # 假设GenreLabel是年龄\n",
    "bin_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0, 0.5, 'Frequency')"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEZCAYAAACAZ8KHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3de5xdVX338c+XJNwSIMFASi4SFKogKkK4qmUA5aYQ+lQUSyUgPmihrbaooIKKiqKlgrZVpBDlHlMUiQpiBKY+FoOA3ALhEiCQISkRMglJJEDw9/yx1ml2hpnJrJlzmWS+79frvM7ea99+Z519zu+stffZWxGBmZlZX23S6gDMzGzD4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVsSJw+pOUrukj7Q6jr6SFJJ27sN8J0r6TT+30Sapoz/LFmxjgaR3NXIbZuDEsdHKXyIvSFohaZmk2yR9TNKQes8lbSXpm7k+Vkl6StK1kvZpdWzdkfSO/F4tl7RU0n9L2ruF8bwqGQ0kgVbW0fBEao0zpL5EhqCjImIrYEfgPOAM4NLWhtQYkoZ3U7YZcAvwZuC9wNbArsAM4MimBtgHkrYGfgb8K7AtMAE4B3ixlXFtiCQNa3UMGzMnjiEgIpZHxCzgA8A0SbtD+mKVdH7+Ff6MpIskbZGnzZP03to6JA2X9KykPfP4fvmX8TJJ90pq627bkjaRdJakJyUtkXS5pG3ytMm5m+gUSYskLZZ0epdlz5T0mKTnJM2UtG2XZU+W9BQpQXT1IWAicExEzI2IVyJiVURcGxFf7CHebXKMf8gxn9WllSZJ/5pbBA9JOqQy4aRcbyskPS7po+t9c9b15wARcU2O9YWI+GVE3JfX/3pJt+S6eFbSVZJG9/A6equ7zSVdmcuXSbpD0rjCWKvb2jV3Ty6T9ICkoyvTjpT0YK6TpyV9UtJI4EZgvKSV+TE+748X5n1hUR7erLKuT+d9ZJGkj6jSxSjpB5K+K+kGSauAgyS9R9Ldkp6XtFDSFyvrqu0/J+VpnUot8r0l3Zdfy7/1t042ehHhx0b4ABYA7+qm/Cngb/PwhcAs0q/brYCfAl/L0z4PXFVZ7j3AQ3l4AvAc6Vf7JsC78/h2eXo78JE8/GFgPvA6YBTwY+CKPG0yEMA1wEhSy+APtbiBTwBzSF/+mwHfA67psuzledktunmtM4Af9KGuAtg5D18OXJ/rYzLwCHBynnYisAb4R2AEKREvB7at1NHrAQEHAn8E9szT2oCO9cSxda7Hy4AjgDFdpu+c63ozYDvg18CF3b3n66m7j+b3ektgGLAXsHVf96NcD7/JwyPy+/tZYFPgYGAF8IY8fTHwzjw8prf6AL6UY94+v77bgC/naYcD/wO8Kcd9RZf37Qf5vXg7aZ/cPG/jzXn8LcAzpB8R1f3nojzvocBq4Cd5+xOAJcCBrf4sD8ZHywPwo0FvbM+JYw7wufzltgp4fWXa/sATeXjn/AWwZR6/Cvh8Hj6D/OVfWfYmYFoebmdt4rgZOLUy3xuAl4HhlQ/vGyvTvwFcmofnAYdUpu3QzbKv66UOfgWcVxnfA1gGPA88XCmP/HqHkbqFdqtM+yjQnodPBBYBqkz/HfChHrb/E+DjebiN9SSOPN+u+Uuwg5SkZgHjepj3GODu7t7z9dTdh0lfym/p4360Mtdb7fFH1iaOd5K+0DepLHMN8MU8/FSuw627rPdV9QE8BhxZGT8MWJCHp5N/1FT2z66J4/L1vJYLgQvycG3/mVCZ/hzwgcr4j4BPtOozPJgf7qoaeiYAS0m/6LYE7srN8mXAL3I5ETGf9OVzlKQtgaOBq/M6dgSOrS2Xl30H6cupq/HAk5XxJ0lfXtWukYVdpo+vbOe6yjbmAa/0smxXz1Vjioh7ImI08H9Iv8K7Gkv61dw13gmV8acjf6t0jVfSEZLmKB3UXkZqkY3tJb5X
iYh5EXFiREwEds/rvjCvf3tJM3KXz/PAlb2sv7e6u4KU6Gfkbp9vSBrRS1jHRMTo2gM4tTJtPLAwIv7UpU5qdfZXpHp4UtJ/Sdq/l+10t6+Mr0yrvtfdve/rlEnaV9KtudtxOfAxXl1fz1SGX+hmfFQv8Q5ZThxDiNLZOROA3wDPkj4Yb6p8KWwTEdUPyjXAB4GpwIM5mUD6gF5R/TKJiJERcV43m11E+hKreS3pl3T1Azqpy/RFle0c0WU7m0fE05X5e7u8883AoblPvS+eJf0q7xpvdXsTJKlrvLkv/kfA+aQWwmjgBlLLrl8i4iHSL+ndc9HXSK/3LRGxNfA3vay/x7qLiJcj4pyI2A04gHTiwAn9DHMRMKnLcaD/rbOIuCMippK6f34CzKy9vB7W1bXua/vCYlK3W011n6npus6rSS22SRGxDalbqt/vh63lxDEESNpa6UD3DODKiLg//0L8D+ACSdvn+SZIOqyy6AxS3+/fsra1AemX7lGSDpM0LB9sbZNU/WDXXAP8o6SdJI0Cvgr8MCLWVOY5W9KWkt4EnAT8MJdfBJwraccc33aSpha89MtJXzjXSdq9FiswpbuZI+IV0hfbuUqn8e4I/FN+vTXbA/8gaYSkY0ldSzeQWiqbkY7RrJF0BKnu+kzSGyWdXqtHSZNIiXtOnmUrcreRpAnAp3pZXY91J+kgSW9WOvPoeVKyfKUk1orbSV2en8510gYcRWrNbCrpeEnbRMTLeVu17TwDvEb5RInsGuCsHOtY0nG2Wt3PBE5SOhC/ZZ62PlsBSyNitdLp13/dz9doXThxbNx+KmkF6dfn54Bvkr6Ya84gHdick7s+fkU6BgFARCwGfkv6VfrDSvlCUivks6QvyoWkL7Hu9qfppK6RXwNPkA5A/n2Xef4rx3EzcH5E/DKXf4v0i/GX+XXMAfbt64uPiNXAQcCDwM/JxzaAvYH397DY35O+CB8ntcyuzq+h5nZgF1Lr5FzgfRHxXESsAP6B9AXXSfqSmtXXWLMVpNd3ez4zaA4wF6idaXYOsCfpIPDPSSca9KS3uvsz4FpSfcwj1f+V3a1kfSLiJVI35hGkOvkOcEJuLUE6s21B3r8+Rmol1VpT1wCP5+608cBXgDuB+4D7gd/nMiLiRuDbwK2kfeW3ef29nap8KvCl/Po/z9rWjg2Q1u2uNWseSZNJyWRElxaIWa8k7UpKqpt532k+tzjMbIMg6S9z99cY4OvAT500WsOJw6yJJL1Wa//01vXx2lbHN8h9lNQ1+hjpWMnftjacoctdVWZmVsQtDjMzK/KqC8NtbMaOHRuTJ0/u9/KrVq1i5Mi+/g2geRxXGcdVxnGV2Rjjuuuuu56NiO26ndjqv643+rHXXnvFQNx6660DWr5RHFcZx1XGcZXZGOMC7gxfcsTMzOrBicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlZko7/kiJkNLjNvm9vvZUesWt3v5d9/wO7rn8n6xC0OMzMr4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVqRpiUPSAkn3S7pH0p25bFtJsyU9mp/H5HJJ+rak+ZLuk7RnZT3T8vyPSprWrPjNzCxpdovjoIjYIyKm5PEzgZsjYhfg5jwOcASwS36cAnwXUqIBvgDsC+wDfKGWbMzMrDla3VU1FbgsD18GHFMpvzzfwXAOMFrSDsBhwOyIWBoRncBs4PBmB21mNpQp3Vq2CRuSngA6gQC+FxEXS1oWEaMr83RGxBhJPwPOi4jf5PKbgTOANmDziPhKLj8beCEizu+yrVNILRXGjRu314wZM/od98qVKxk1alS/l28Ux1XGcZVpZFydq1b3e1mteYkYvmm/lh0zcvN+b3d9Nsb38aCDDrqr0ju0jmZecuTtEbFI0vbAbEkP9TKvuimLXsrXLYi4GLgYYMqUKdHW1taPcJP29nYGsnyjOK4yjqtMI+Ma0CVHOjt4
eczEfi3b1sBLjgy197FpXVURsSg/LwGuIx2jeCZ3QZGfl+TZO4BJlcUnAot6KTczsyZpSuKQNFLSVrVh4FBgLjALqJ0ZNQ24Pg/PAk7IZ1ftByyPiMXATcChksbkg+KH5jIzM2uSZnVVjQOuk1Tb5tUR8QtJdwAzJZ0MPAUcm+e/ATgSmA/8ETgJICKWSvoycEee70sRsbRJr8HMzGhS4oiIx4G3dlP+HHBIN+UBnNbDuqYD0+sdo5mZ9U2rT8c1M7MNjBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWZGmJg5JwyTdLelneXwnSbdLelTSDyVtmss3y+Pz8/TJlXV8Jpc/LOmwZsZvZmbNb3F8HJhXGf86cEFE7AJ0Aifn8pOBzojYGbggz4ek3YDjgDcBhwPfkTSsSbGbmRlNTBySJgLvAS7J4wIOBq7Ns1wGHJOHp+Zx8vRD8vxTgRkR8WJEPAHMB/ZpziswMzMARURzNiRdC3wN2Ar4JHAiMCe3KpA0CbgxInaXNBc4PCI68rTHgH2BL+Zlrszll+Zlru2yrVOAUwDGjRu314wZM/od98qVKxk1alS/l28Ux1XGcZVpZFydq1b3e1mteYkYvmm/lh0zcvN+b3d9Nsb38aCDDrorIqZ0N234gKLqI0nvBZZExF2S2mrF3cwa65nW2zJrCyIuBi4GmDJlSrS1tXWdpc/a29sZyPKN4rjKOK4yjYxr5m1z+73siM4OXh4zsV/Lth2we7+3uz5D7X1sSuIA3g4cLelIYHNga+BCYLSk4RGxBpgILMrzdwCTgA5Jw4FtgKWV8prqMmZm1gRNOcYREZ+JiIkRMZl0cPuWiDgeuBV4X55tGnB9Hp6Vx8nTb4nUpzYLOC6fdbUTsAvwu2a8BjMzS5rV4ujJGcAMSV8B7gYuzeWXAldImk9qaRwHEBEPSJoJPAisAU6LiFeaH7bZhm193UUjVq0eUJeSbdyanjgioh1oz8OP081ZURGxGji2h+XPBc5tXIRmZtYb/3PczMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIk4cZmZWxInDzMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIn1OHJL+QdLYRgZjZmaDX0mL413AAkk/k/QBSZs1KigzMxu8+pw4IuJoYEfgRuATwP9IukTSXzQqODMzG3yKjnFExHMR8e8RsT9wILA3cKukBZI+J2lUQ6I0M7NBo/jguKRDJH2fdN/wZ4ATgA8BbyO1RszMbCM2vK8zSjofOA5YDlwOnBURT1emzwE66x6hmZkNKn1OHMDmwF9GxB3dTYyIlyVNqU9YZmY2WJUkjq8Bf6wWSBoDbBERiwAi4qE6xmZmZoNQyTGOnwATu5RNBK6rXzhmZjbYlSSON0TE/dWCPP7G+oZkZmaDWUniWCJp52pBHn+uviGZmdlgVpI4pgM/kvReSbtJOgq4FrikMaGZmdlgVHJw/DzgZeB8YBKwkJQ0vtmAuMzMbJDqc+KIiD8B/5wfZmY2RJW0OJD0BuCtwDqXFomI6fUMyszMBq+Sy6p/FrgXOJ10iZHa42/6sOzmkn4n6V5JD0g6J5fvJOl2SY9K+qGkTXP5Znl8fp4+ubKuz+TyhyUdVvJizcxs4EpaHJ8A9omI+/qxnReBgyNipaQRwG8k3Qj8E3BBRMyQdBFwMvDd/NwZETtLOg74OvABSbuRLnvyJmA88CtJfx4Rr/QjJjMz64eSs6peAPr1z/BIVubREfkRwMGkM7MALgOOycNT8zh5+iGSlMtnRMSLEfEE
MB/Ypz8xmZlZ/5QkjrOBf5W0g6RNqo++LCxpmKR7gCXAbOAxYFlErMmzdAAT8vAE0llb5OnLgddUy7tZxszMmqCkq+oH+fkjlTKRWg7D1rdw7k7aQ9Jo0mVKdu1utsp6u5vWU/k6JJ0CnAIwbtw42tvb1xdej1auXDmg5RvFcZVxXOsasWp1r9O15iVGdHY0KZq+G0hc7e3P1jmatYba/lWSOHaqxwYjYpmkdmA/YLSk4blVMRFYlGfrIP1XpEPScGAbYGmlvKa6THUbFwMXA0yZMiXa2tr6HW97ezsDWb5RHFcZx7WumbfN7XX6iM4OXh7T9dJ0rTeQuNoO2L3O0aw11PavklvHPhkRT5K6il6qjeeyXknaLrc0kLQF6f7l84Bbgffl2aYB1+fhWXmcPP2WiIhcflw+62onYBfgd319DWZmNnAlN3IaDXyH9EX+MjBS0tGkM63OWs/iOwCXSRpGSlYzI+Jnkh4EZkj6CnA3cGme/1LgCknzSS2N4wAi4gFJM4EHgTXAaT6jysysuUq6qi4i3eFvR9IXN8BvgX8Bek0c+RTet3VT/jjdnBUVEauBY3tY17nAuQVxm5lZHZUkjkOA8flOfwEQEX+QtH1jQjMzs8Go5HTc5cDYaoGk1wKL6xqRmZkNaiWJ4xLSZdUPAjaRtD/pT3oXNSQyMzMblEq6qr4OrAb+nfTP7+nA94BvNSAuMzMbpEouqx7AhflhZmZDVMnpuAf3NC0ibqlPOGZmNtiVdFVd2mV8O2BT0r+5X1e3iMzMbFAr6apa55Ij+c98ZwEr6h2UmZkNXiVnVa0j/2P7XODT9QvHzMwGu34njuzdwJ/qEYiZmW0YSg6OL2TdS5hvCWwOnFrvoMzMbPAqOTje9d7iq4BHIuL5OsZjZmaDXMnB8f9qZCBmZrZhKOmquoJu7rbXVUScMKCIzMxsUCs5OL4MOIZ0m9iOvOzUXP5Y5WFmZhuxkmMcfw68JyL+X61A0juAsyPisLpHZmZmg1JJi2M/YE6XstuB/esXjpmZDXYlLY67ga9K+nxEvJDvHX4OcE9jQjPb+HWuWs3M2+a2OgyzIiWJ40TgamC5pE5gDHAncHwD4jIzq6tGJugRvfwAeP8Buzdsu61ScjruAuAASZOA8cDiiHiqUYGZmdngVHTJEUmvAdqAAyPiKUnjJU1sSGRmZjYo9TlxSDoQeJjUNXV2Lt4F+G4D4jIzs0GqpMVxIfCBiDgcWJPLbgf2qXtUZmY2aJUkjskRcXMerv2D/CXKDrCbmdkGriRxPCip6x/93gXcX8d4zMxskCtpLZwO/EzSz4EtJH0POIp02REzMxsi+tziiIg5wFuAB4DpwBPAPhFxR4NiMzOzQahPLY58f/GbgcMi4huNDcnMzAazPrU48v3Fd+rr/GZmtvEqSQTnAN+VtKOkYZI2qT0aFZyZmQ0+JQfHL8nPJ7D2dFzl4WH1DMrMzAav9bYWJP1ZHtyp8nhdftSG17eOSZJulTRP0gOSPp7Lt5U0W9Kj+XlMLpekb0uaL+k+SXtW1jUtz/+opGnFr9jMzAakL91MjwBExJMR8SRwQW24UrY+a4DTI2JX0n09TpO0G3AmcHNE7EI6+H5mnv8I0uVMdgFOIV/WRNK2wBeAfUn/WP9CLdmYmVlz9CVxqMt4W+lGImJxRPw+D68A5gETSP8BuSzPdhnp1rTk8ssjmQOMlrQDcBgwOyKWRkQnMBs4vDQeMzPrv74c44j1z9J3kiYDbyNd52pcRCyGlFwkbZ9nmwAsrCzWkct6Ku+6jVNILRXGjRtHe3t7v+NduXLlgJZvFMdVZrDGpTUvMaKzo9VhvIrjKtNbXO3tzzY5mrUatd/3JXEMl3QQa1seXceJiFv6sjFJo4AfAZ+IiOelro2ZtbN2Uxa9lK9bEHExcDHAlClToq2trS/hdau9vZ2BLN8ojqvMYI3rup//gpfHDL47E4zo7HBcBXqLq62FN3Jq1H7fl8SxhPRP8Zrn
uowHfTtAPoKUNK6KiB/n4mck7ZBbGzvkbUFqSUyqLD4RWJTL27qUt/fhNZiZWZ2s9xhHREyOiJ16efQlaQi4FJgXEd+sTJoF1M6MmgZcXyk/IZ9dtR+wPHdp3QQcKmlMPih+aC4zM7MmadYl0d8OfAi4X9I9ueyzwHnATEknA08Bx+ZpNwBHAvOBPwInAUTEUklfBmrXx/pSRCxtzkswMzNoUuKIiN/Q/fEJgEO6mT+A03pY13TW7SozM7Mm8uVCzMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVceIwM7MiThxmZlbEicPMzIo4cZiZWREnDjMzKzK81QGYtdrM2+a2bNsjWrZls/5zi8PMzIo4cZiZWREnDjMzK+LEYWZmRZw4zMysiBOHmZkVaUrikDRd0hJJcytl20qaLenR/Dwml0vStyXNl3SfpD0ry0zL8z8qaVozYjczs3U1q8XxA+DwLmVnAjdHxC7AzXkc4Ahgl/w4BfgupEQDfAHYF9gH+EIt2ZiZWfM0JXFExK+BpV2KpwKX5eHLgGMq5ZdHMgcYLWkH4DBgdkQsjYhOYDavTkZmZtZgrfzn+LiIWAwQEYslbZ/LJwALK/N15LKeyl9F0imk1grjxo2jvb2930GuXLlyQMs3iuMq01tcI1atbm4wFVrzEiM6O1q2/Z44rjK9xdXe/myTo1mrUZ/HwXjJEXVTFr2Uv7ow4mLgYoApU6ZEW1tbv4Npb29nIMs3iuMq01tcLb3kSGcHL4+Z2LLt98RxlektrrYDdm9yNGs16vPYyrOqnsldUOTnJbm8A5hUmW8isKiXcjMza6JWJo5ZQO3MqGnA9ZXyE/LZVfsBy3OX1k3AoZLG5IPih+YyMzNroqZ0VUm6BmgDxkrqIJ0ddR4wU9LJwFPAsXn2G4AjgfnAH4GTACJiqaQvA3fk+b4UEV0PuJuZWYM1JXFExAd7mHRIN/MGcFoP65kOTK9jaGZmVsj/HDczsyJOHGZmVsSJw8zMijhxmJlZEScOMzMr4sRhZmZFnDjMzKyIE4eZmRVx4jAzsyJOHGZmVsSJw8zMijhxmJlZEScOMzMr4sRhZmZFnDjMzKyIE4eZmRVpyo2czPpi5m1zG7buEatWN3T9ZkOJWxxmZlbEicPMzIo4cZiZWREf4zAza6BWHlvbvkHrdYvDzMyKOHGYmVkRJw4zMyvixGFmZkWcOMzMrIgTh5mZFXHiMDOzIk4cZmZWxInDzMyK+J/j69HZoquqvv+A3Zu+TRj4v1x9FVqzjd8GmTgkHQ58CxgGXBIR57U4pLpb35evv6DNrFU2uK4qScOAfweOAHYDPihpt9ZGZWY2dGxwiQPYB5gfEY9HxEvADGBqi2MyMxsyNsSuqgnAwsp4B7BvdQZJpwCn5NGVkh4ewPbGAs8OYPlGcVxlHFcZx1VmY4xrx54mbIiJQ92UxTojERcDF9dlY9KdETGlHuuqJ8dVxnGVcVxlhlpcG2JXVQcwqTI+EVjUoljMzIacDTFx3AHsImknSZsCxwGzWhyTmdmQscF1VUXEGkl/B9xEOh13ekQ80MBN1qXLqwEcVxnHVcZxlRlScSki1j+XmZlZtiF2VZmZWQs5cZiZWZEhlzgkTZe0RNLcStlbJf1W0v2Sfipp61w+QtJluXyepM9Uljlc0sOS5ks6cxDFtSCX3yPpzibHtamk7+fyeyW1VZbZK5fPl/RtSd2dVt2KuNrz+3hPfmw/wLgmSbo1vy8PSPp4Lt9W0mxJj+bnMblcuT7mS7pP0p6VdU3L8z8qadogiuuVSn0N6MSUfsT1xvwevyjpk13WVbfPZJ3jqttnsh9xHZ/fv/sk3SbprZV19b++ImJIPYC/APYE5lbK7gAOzMMfBr6ch/8amJGHtwQWAJNJB+Uf
A14HbArcC+zW6rjy+AJgbIvq6zTg+3l4e+AuYJM8/jtgf9L/cG4EjhgkcbUDU+pYXzsAe+bhrYBHSJfG+QZwZi4/E/h6Hj4y14eA/YDbc/m2wOP5eUweHtPquPK0lS2sr+2BvYFzgU9W1lPXz2S94srTFlCnz2Q/4jqgtt+QLtNU278GVF9DrsUREb8GlnYpfgPw6zw8G/ir2uzASEnDgS2Al4DnacBlT+oUV90VxrUbcHNebgmwDJgiaQdg64j4baS99nLgmFbHNZDt9xLX4oj4fR5eAcwjXe1gKnBZnu0y1r7+qcDlkcwBRuf6OgyYHRFLI6Izv57DB0FcdVUaV0QsiYg7gJe7rKqun8k6xlVX/Yjrtrz/AMwh/e8NBlhfQy5x9GAucHQePpa1fzC8FlgFLAaeAs6PiKV0f9mTCYMgLkhJ5ZeS7lK69Eoj9BTXvcBUScMl7QTsladNINVRTbPrq6e4ar6fuxHOlgbWhVYlaTLwNuB2YFxELIb04Sf9QoWe96WG7WMDjAtgc0l3SpojaUA/APoRV09aXV+9achnsh9xnUxqRcIA68uJI/kwcJqku0jNv5dy+T7AK8B4YCfgdEmvow+XPWlRXABvj4g9Sc3S0yT9RRPjmk7aAe8ELgRuA9bQ+vrqKS6A4yPizcA78+ND9QhE0ijgR8AnIqK31mBPddOQOqtDXACvjXQZi78GLpT0+ibG1eMquilrZn31pu6fydK4JB1EShxn1Iq6ma3P9eXEAUTEQxFxaETsBVxD6vuD9MH4RUS8nLs4/pvUxdGUy570Iy4iYlF+XgJcR0oyTYkrItZExD9GxB4RMRUYDTxKqq+JlVU0tb56iYuIeDo/rwCupg71JWkE6UN9VUT8OBc/U+vqyc9LcnlP+1Ld97E6xVXdxx4nHSN6WxPj6kmr66tH9f5MlsYl6S3AJcDUiHguFw+ovpw4AOUzaSRtApwFXJQnPQUcnM8wGUk6SPgQTbrsSWlckkZK2iovMxI4lNR905S4JG2Zt4ukdwNrIuLB3HReIWm/3BV0AnB9q+PKXVdjc/kI4L0MsL7y67sUmBcR36xMmgXUzoyaxtrXPws4Ib+X+wHLc33dBBwqaUw+Q+bQXNbSuHI8m+V1jgXeDs2iZSQAAATsSURBVDzYxLh6UtfPZL3iqvdnsjQuSa8Ffgx8KCIeqcw/sPrq61H0jeVB+iW6mHQQq4PUfPs46eyER4DzWPuP+lHAfwIPkD4cn6qs58g8/2PA5wZDXKQzJO7NjwdaENdk4GHSAbtfATtW1jOF9IF5DPi32jKtjAsYSTrD6r5cX98Chg0wrneQmvz3Affkx5HAa0gH6B/Nz9vm+UW6MdljwP1UzvAidb3Nz4+TBkNcpLN07s/72P3AyU2O68/y+/086SSHDtKJF1DHz2S94qLOn8l+xHUJ0FmZ987KuvpdX77kiJmZFXFXlZmZFXHiMDOzIk4cZmZWxInDzMyKOHGYmVkRJw4zMyvixGGWSfqBpK/0cd6QtHM/t7NA0rv6s2wf1/9FSVc2av1mThw2pEg6TtLtklYp3c/jdkmn1vPihvUgabTSPUf+R9IKSY9IOmP9S5o1nhOHDRmSTif9O/yfSf/0HQd8jHTZjE1bGFp3LiBdIWBXYBvSVX8f63UJsyZx4rAhQdI2wJeAUyPi2ohYEcndEXF8RLzYzTL/V+nuaEslzZI0vsssR0p6XNKzkv45XyMLSa+XdIuk5/K0qySNLgx5b+DqiOiMiD9FuoDjtZXYviVpoaTnlS7X/c5eXvt+Snd/W6ZX3wHxxPwaVkh6QtLxhXHaEOTEYUPF/sBm9PHiipIOBr4GvJ9017UnSTe7qfpL0jW49iTdBOfDtcXzsuNJLYZJwBcL450DnCvpJEm7dDP9DmAP0h0Crwb+U9Lm3byOCcDPga/keT8J/EjSdvmie98m3Y1xK9J1qO4pjNOGICcOGyrGAs9GRO0+HFR+hb+gV98j4XhgekT8PrdG
PgPsr3TznJqvR7pD31Ok+3x8ECAi5kfE7Ih4MSL+AHwTOLAw3r8HrgL+Dngwt3yOqE2MiCsj4rlIl4z/F1JSfEM36/kb4IaIuCG3XGaT7ktyZJ7+J2B3SVtEurvcA4Vx2hDkxGFDxXPAWKXb7QIQEQdExOg8retnYTyplVGbd2Wer3qXtOod1J7MyyBpe0kzJD0t6XngSlLi6rOIeCEivhrp3iKvAWaSWhXb5m2cLmmepOWSlpGOg3S3jR2BY3OCXJbnfQewQ0SsAj5AOs6zWNLPJb2xJE4bmpw4bKj4LfAifb+v8iLSly7wv/dSeA3wdGWe6o1wXsvaG+F8jXTp67dExNakX/39Pmsr0h3evkq6DPxO+XjGGaRutDE5+S3vYRsLgSsiYnTlMTIizsvrviki3k3qjnsI+I/+xmlDhxOHDQkRsQw4B/iOpPdJGiVpE0l7kL6Qu7oaOEnSHvnGRV8Fbo+IBZV5PpVvbDSJdC+QH+byrYCVwLJ8jOFTpfEq3f98b0mb5mMXHyfd5+HhvP41wB+A4ZI+T7r3Q3euBI6SdJikYZI2l9QmaaKkcZKOzknxxRzzK6Wx2tDjxGFDRkR8A/gn4NOkW2s+A3yP9Ov9ti7z3gycTbpF52Lg9aS7pFVdT7oR1D2kA9CX5vJzSAfMl+fyH1MugO8Dz5JaMu8G3pO7zG4CbiTdhOdJYDXrdptVX8dCUivrs6REs5CUyDbJj9Pz+peSjsOc2o9YbYjxjZzMzKyIWxxmZlbEicOsRSTdKGllN4/Ptjo2s964q8rMzIq4xWFmZkWcOMzMrIgTh5mZFXHiMDOzIv8fEgSYdA1YC3gAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "import scipy.stats as spstats\n",
    "\n",
    "# Histogram of the Year column of bin_df.\n",
    "fig, ax = plt.subplots()\n",
    "# Draw on the axes created above so the labels below apply to this plot.\n",
    "bin_df['Year'].hist(color='#A9C5D3', ax=ax)\n",
    "# The plotted column is Year, so the title and the x-axis label refer to Year.\n",
    "ax.set_title('Game Release Year Histogram', fontsize=12)\n",
    "ax.set_xlabel('Year', fontsize=12)\n",
    "ax.set_ylabel('Frequency', fontsize=12)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这样区间就出来了，我们可以分成多个区间，如1980-1985是一个区间，1986-1990是一个区间"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Year</th>\n",
       "      <th>Year_bin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>1985.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>1996.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Tetris</td>\n",
       "      <td>1989.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>New Super Mario Bros.</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Wii Play</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>New Super Mario Bros. Wii</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Duck Hunt</td>\n",
       "      <td>1984.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        Name    Year  Year_bin\n",
       "0                 Wii Sports  2006.0         5\n",
       "1          Super Mario Bros.  1985.0         1\n",
       "2             Mario Kart Wii  2008.0         6\n",
       "3          Wii Sports Resort  2009.0         6\n",
       "4   Pokemon Red/Pokemon Blue  1996.0         3\n",
       "5                     Tetris  1989.0         2\n",
       "6      New Super Mario Bros.  2006.0         5\n",
       "7                   Wii Play  2006.0         5\n",
       "8  New Super Mario Bros. Wii  2009.0         6\n",
       "9                  Duck Hunt  1984.0         0"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Equal-width binning of Year: cut into 9 intervals, then map each interval to an integer code.\n",
    "gle = LabelEncoder()\n",
    "# The interval labels are converted to strings before they are handed to the encoder.\n",
    "# NOTE(review): a missing Year would presumably become the string 'nan' and get its own code - confirm.\n",
    "year_interval_labels = pd.cut(bin_df['Year'], bins=9).astype(str)\n",
    "# fit_transform numbers the distinct labels starting at 0 (not 1), in sorted label order.\n",
    "bin_year = gle.fit_transform(year_interval_labels)\n",
    "bin_df['Year_bin'] = bin_year\n",
    "bin_df.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 对数变换\n",
    "\n",
    "很多模型都假设数据服从正态分布，例如线性回归要求误差项服从正态分布。当数据明显偏态、不满足这一假设时，可以对数据做变换（如对数变换），使其分布更接近正态分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>NA_Sales</th>\n",
       "      <th>NA_Sales_log</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Wii Sports</td>\n",
       "      <td>41.49</td>\n",
       "      <td>3.749269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Super Mario Bros.</td>\n",
       "      <td>29.08</td>\n",
       "      <td>3.403860</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Mario Kart Wii</td>\n",
       "      <td>15.85</td>\n",
       "      <td>2.824351</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Wii Sports Resort</td>\n",
       "      <td>15.75</td>\n",
       "      <td>2.818398</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pokemon Red/Pokemon Blue</td>\n",
       "      <td>11.27</td>\n",
       "      <td>2.507157</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Name  NA_Sales  NA_Sales_log\n",
       "0                Wii Sports     41.49      3.749269\n",
       "1         Super Mario Bros.     29.08      3.403860\n",
       "2            Mario Kart Wii     15.85      2.824351\n",
       "3         Wii Sports Resort     15.75      2.818398\n",
       "4  Pokemon Red/Pokemon Blue     11.27      2.507157"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build a small frame with NA_Sales and its log(1 + x) transform.\n",
    "# .copy() makes df_log an independent frame, so adding a column does not write into a\n",
    "# slice of vg_df (which can trigger pandas' SettingWithCopyWarning).\n",
    "df_log = vg_df[['Name', 'NA_Sales']].copy()\n",
    "# log1p(x) computes log(1 + x); the +1 keeps zero sales finite (log1p(0) == 0).\n",
    "df_log['NA_Sales_log'] = np.log1p(df_log['NA_Sales'])\n",
    "df_log.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x18ec49c7b38>"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAdgUlEQVR4nO3dfbBV1Znn8e9vQPNi2ga0IYRLGjKNGsOM3UoJJinmljYv2imxUo4tSY0YqKIqjT127E6EydTQ0RBlZiq2ONGULbTY2iBtx5IyBkTkTjK24HsrSNAbtOQqSqh7fUET9Zpn/tjryOFwzuXe87rl/D5Vt+7Za699znNgw7PXXmuvpYjAzMza279rdQBmZtZ6TgZmZuZkYGZmTgZmZoaTgZmZAcNbHUC1TjzxxJgwYULZfW+//TbHHXdccwM6grzFlLd4IF8xPf744/sj4g+a/bkfhfM6D3E4huriGPC8joiP5M8ZZ5wRlWzZsqXivlbJW0x5iyciXzEBj4XP67LyEIdjOGgocQx0Xvs2kZmZORmYmZmTgZmZ4WRgZmY4GZiZGU4GZmaGk4GZmeFkYGZmOBmYmRkf4ekoBtL39m9Z96/bh3TMRV+c3KBozOqjmvMafG7b4ByxZSBplaR9kg47CyX9jaSQdGLalqQVkrolPS3p9KK68yQ9n37mFZWfIemZdMwKSarXlzMzs8EZzG2iW4HZpYWSxgMzgJeKis8FJqWfhcBNqe4oYCkwFTgTWCppZDrmplS3cNxhn2VmZo11xGQQET8Hesvsug74DlC8iPIc4LY0J9JWYISkscAsYFNE9EZEH7AJmJ32HR8RD6dJlG4DLqjtK5kNzvz58xk9ejSTJx96G+WGG24AmCxph6T/WSiXtCS1YHdJmlVUPjuVdUtaXFQ+UdK21Bq+U9KxTfhaZlWpqs9A0vnAyxHxbyV3dcYBe4q2e1LZQOU9Zcorfe5CslYEY8aMoaurq3y9/vc4pq+n7L5Kurr2D6n+UB04cKBivK2Qt3ig+TGddtppTJs2jWuuuebDz33yySe5/fbbAXZExBmSRgNIOhW4GPgC8BngAUknpbf6EVkruQd4VNL6iHgWWA5cFxFrJf0YWEBqLZvlzZCTgaRPAt8FZpbbXaYsqigvKyJuBm4GmDJlSnR2dpatd/dPN/D+yI5Kb1NWZ4M72bq6uqgUbyvkLR5ofkydnZ28+OKLrFix4sPPvfHGG1m+fDkzZswIgIjYl6rPAdZGxLvAC5K6yW55AnRHxG4ASWuBOZJ2AmcDX0t1VgN/i5OB5VQ1LYN/D0wECq2CDuAJSWeSXRmNL6rbAbySyjtLyrtSeUeZ+mYt8dxzz/GLX/wC4BRJ/xf4m4h4lKzFurWoanErtrTVOxU4AXg9IvrL1D9EI1u8UP9Wbx5alY6h/nEMORlExDPA6MK2pBeBKRGxX9J64LJ0dTQVeCMi9kraCPygqNN4JrAkInolvSVpGrANuAS4obavZFa9/v5++vr6AH4JfBtYJ+lzVG7Flut3G1Krt5EtXqh/qzcPrUrHUP84BjO0dA3wMHCypB5JCwaofh+wG+gG/h74C4CI6AWuBh5NP1elMoBvArekY34F/Ky6r2JWu46ODr761a8CEBGPAL8DTmTgVm+58v1kAyiGl5Sb5dIRWwYRMfcI+ycUvQ5gUYV6q4BVZcofA/xUjOXCBRdcwIMPPghA6iA+luw/9vXAP0n6IVkH8iTgEbIWwCRJE4GXyTqZvxYRIWkLcCGwFpgH3NPkr2M2aJ6OwtrW3LlzOeuss9i1axcdHR2sXLmS+fPns3v3bshGDa0F5qWh0juAdcCzwAZgUUR8kPoELgM2AjuBdakuwJXAFamz+QRgZXO/odngHZXTUZgNxpo1a8qW33777dxxxx07ImJKcXlELAOWldaPiPvIbpGWlu/m4Igjs1xzy8DMzJwMzMzMycDMzHAy
MDMznAzMzAwnAzMzw8nAzMxwMjAzM5wMzMwMJwMzM8PJwMzMcDIwMzOcDMzMDCcDMzPDycDMzHAyMDMznAysjc2fP5/Ro0czeXLZVVfHSApJJwIos0JSt6SnJZ1eqChpnqTn08+8ovIzJD2TjlkhSY3/VmbVcTKwtnXppZeyYcOGw8r37NkDcDzwUlHxuWTrHk8CFgI3AUgaBSwFppKtarZU0sh0zE2pbuG42Y34Hmb1cMRkIGmVpH2StheV/S9Jv0xXSHdLGlG0b0m6EtolaVZR+exU1i1pcVH5REnb0lXVnZKOrecXNKtk+vTpjBo16rDyb33rWwA9QBQVzwFuS+shbwVGSBoLzAI2RURvRPQBm4DZad/xEfFwRARwG3BBY7+RWfUGswbyrcD/ITuZCzYBSyKiX9JyYAlwpaRTgYvJFhP/DPCApJPSMT8CZpD9I3tU0vqIeBZYDlwXEWsl/RhYQLrqMmu29evXM27cOIDfAMcV7RoH7Cna7kllA5X3lCk/jKSFZC0IxowZQ1dXV9nY1P8ex/T1lN03kK6u/UM+ZiAHDhyoGGOzOIb6x3HEZBARP5c0oaTs/qLNrcCF6fUcYG1EvAu8IKmbgwuCd6cFwpG0FpgjaSdwNvC1VGc18Lc4GVgLvPPOOyxbtoz777+fFStWlO4ud78/qig/vDDiZuBmgClTpkRnZ2fZ+O7+6QbeH9lRPvgBdH6xbJ9I1bq6uqgUY7M4hvrHMZiWwZHMB+5Mr8eRJYeC4quh0qunqcAJwOsR0V+m/mEaeQVV76unUnm5iijIWzzQmpheffVV3n77bbq6uti9ezfPPfccJ598MsB/AIYBT0g6k+zcHF90aAfwSirvLCnvSuUdZeqb5VJNyUDSd4F+4I5CUZlqQfm+iSFdPUFjr6DqffVUKi9XEQV5iwdaE9OLL77IcccdR2dnJ52dncyfPx8ASc8AJwJTImK/pPXAZalVOxV4IyL2StoI/KCo03gm2S3UXklvSZoGbAMuAW5o6pczG4KqRxOlIXRfAb6eOshg4KuncuX7yTrihpeUmzXc3LlzOeuss9i1axcdHR2sXLlyoOr3AbuBbuDvgb8AiIhe4Grg0fRzVSoD+CZwSzrmV8DPGvJFzOqgqpaBpNnAlcB/ioh3inatB/5J0g/JOpAnAY+QtQAmSZoIvEzWyfy1iAhJW8j6HNYC84B7qv0yZkOxZs2aAfdHxISi1wEsqlBvFbCqTPljQGObnGZ1MpihpWuAh4GTJfVIWkA2uuj3gE2SnkqjgIiIHcA64FlgA7AoIj5IfQKXARuBncC6VBeypHJF6mw+ARjw8szMzOpvMKOJ5pYprvgfdkQsA5aVKb+PrKldWr6bgyOOzMysBfwEspmZORmYmZmTgZmZ4WRgZmY4GZiZGU4GZmaGk4GZmeFkYGZmOBmYmRlOBmZmhpOBmZnhZGBmZjgZmJkZTgZmZoaTgbWx+fPnM3r0aCZPPrj+zLe//W1OOeUUgFMl3S1pRGGfpCWSuiXtkjSrqHx2KuuWtLiofKKkbZKel3SnpGOb9NXMhszJwNrWpZdeyoYNGw4pmzFjBtu3b4dsgabngCUAkk4lW6HvC8Bs4EZJwyQNA34EnAucCsxNdQGWA9dFxCSgD1jQ8C9lViUnA2tb06dPZ9SoUYeUzZw5k+HDP1zzaSvZutwAc4C1EfFuRLxAtq7xmemnOyJ2R8R7ZMu3zpEk4GzgrnT8auCCRn4fs1pUtQayWZuYD9yZXo8jSw4FPakMYE9J+VSyJVxfT0u+ltY/hKSFwEKAMWPG0NXVVTYY9b/HMX09Q/4SXV37h3zMQA4cOFAxxmZxDPWPw8nArLxPk/0nf0faVpk6QfnWdQxQ//DCiJuBmwGmTJkSnZ2dZQO6+6cbeH9kR9l9A+n84uQjVxqCrq4uKsXYLI6h/nE4GZiVWL16NcAI4OsRUfgPvAcYX1StA3glvS5Xvh8YIWl4ah0U1zfLnSP2GUhaJWmf
pO1FZaMkbUqjJDZJGpnKJWlFGlXxtKTTi46Zl+o/L2leUfkZkp5Jx6xI91rNWmLDhg0sX74csn6Ad4p2rQculvQxSROBScAjwKPApDRy6FiyTub1KYlsAS5Mx88D7mnW9zAbqsF0IN9KNnqi2GJgcxolsTltQzaiYlL6WQjcBFnyAJaS3Us9E1haSCCpzsKi40o/y6wh5s6dy1lnncWuXbvo6Ohg5cqVXHbZZbz11lsAJ0l6StKPASJiB7CObJTRBmBRRHyQrvovAzYCO4F1qS7AlcAVkrrJ+hBWNvcbmg3eEW8TRcTPJU0oKZ4DdKbXq4EushN/DnBbuiraKmmEpLGp7qaI6AWQtAmYLakLOD4iHk7lt5GNuPhZLV/KbDDWrFlzWNmCBdnoT0nPRsSU4n0RsQxYVnpMRNwH3FemfDfZxY9Z7lXbZzAmIvYCRMReSaNT+TgOH1kx7gjlPWXKy2rkqIt6j7golZeRBwV5iwfyGZNZu6h3B3KlERRDLS+rkaMu6j3iolReRh4U5C0eyGdMZu2i2ofOXku3f0i/96XySiMuBirvKFNuZmZNVG0yWE82OgIOHSWxHrgkjSqaBryRbidtBGZKGpk6jmcCG9O+tyRNS6OILsEjLszMmu6It4kkrSHrAD5RUg/ZqKBrgXWSFgAvAf85Vb8POI/sUf13gG8ARESvpKvJhuEBXFXoTAa+STZi6RNkHcfuPDYza7LBjCaaW2HXOWXqBrCowvusAlaVKX8MaOwNezMzG5AnqjMzMycDMzNzMjAzM5wMzMwMJwMzM8PJwMzMcDIwMzOcDMzMDCcDMzPDycDMzHAysDY2f/58Ro8ezeTJB2dD6e3tZcaMGQCTvaSrtRMnA2tbl156KRs2bDik7Nprr+Wcc84B2I6XdLU24mRgbWv69OmMGjXqkLJ77rmHefM+vLhfTbYMKxQt6RoRW4HCkq6zSEu6RkQfUFjSdSxpSdc0geNtRe9lljv1XunM7CPttddeY+zYsUDzlnRt5HKuUP8lXfOwPKljqH8cTgZmg9OwJV0buZwr1H9J1zwsT+oY6h+HbxOZFRkzZgx79+4FvKSrtRcnA7Mi559/PqtXry5seklXaxu+TWRta+7cuXR1dbF//346Ojr43ve+x+LFi7nooosgW33vDbykq7UJJwNrW2vWrClbvnnzZiRtj4gPl3b1kq52tPNtIjMzqy0ZSPqWpB2StktaI+njkiZK2paexrxT0rGp7sfSdnfaP6HofZak8l2SZtX2lczMbKiqTgaSxgH/FZgSEZOBYcDFwHLguoiYBPQBC9IhC4C+iPgj4LpUD0mnpuO+QPaE5o2ShlUbl5mZDV2tt4mGA5+QNBz4JLAXOBu4K+0vfYKzMEzjLuCcNMpiDrA2It6NiBfIOujOrDEuMzMbgqo7kCPiZUn/G3gJ+A1wP/A48HpE9KdqxU9dfvikZkT0S3oDOCGVby1665Y8qVnvpzRL5eVpxYK8xQP5jMmsXVSdDNKY6jnAROB14J/JJvMqVXjqMtdPatb7Kc1SeXlasSBv8UA+YzJrF7XcJvpT4IWI+HVEvA/8BPgi2QRehSRT/NTlh09qpv2/D/RS+QlOMzNrklqSwUvANEmfTPf+zwGeBbYAF6Y6pU9wFqaDvBB4MI3dXg9cnEYbTSSb6veRGuIyM7MhqqXPYJuku4AngH7gSbJbOD8F1kr6fipbmQ5ZCfyjpG6yFsHF6X12SFpHlkj6gUUR8UG1cZmZ2dDV9ARyRCwlW9ij2G7KjAaKiN9y8NH+0n3LgGW1xGJmZtXzE8hmZuZkYGZmTgZmZoaTgZmZ4WRgZmY4GZiZGU4GZpWM9vTs1k6cDMxKvPzyywBj8PTs1kacDMzKE56e3dqIk4FZiXHjxgG8Sjb/1l7gDYYwPXuqf0JxeZljzHKlpukozI5GfX19ACNo0vTsjVynA+q/Vkce1p1wDPWPw8nArMQDDzwA8G5E/BpA0iHTs6er/3LTs/dU
Mz17I9fpgPqv1ZGHdSccQ/3j8G0isxKf/exnAT7l6dmtnbhlYFZi6tSpkI0W8vTs1jacDMzKeyUippSUeXp2O2r5NpGZmTkZmJmZk4GZmeFkYGZmOBmYmRk1JgNJIyTdJemXknZKOkvSKEmb0syOmySNTHUlaUWawfFpSacXvc+8VP95SfMqf6KZmTVCrS2D64ENEXEKcBqwE1gMbE4zO25O25A9zj8p/SwEbgKQNApYCkwlG7a3tJBAzMysOapOBpKOB6aTHryJiPci4nUOncGxdGbH2yKzlezR/rHALGBTRPRGRB+wiWy6XzMza5JaHjr7HPBr4B8knUY2q+PlwJiI2AsQEXsljU71K83gOOiZHRs5oVe9J/MqlZdJrQryFg/kMyazdlFLMhgOnA78ZURsk3Q9B28JlVPTzI7Q2Am96j2ZV6m8TGpVkLd4IJ8xmbWLWvoMeoCeiNiWtu8iSw6vpds/pN/7iuqXm8FxUDM7mplZ41SdDCLiVWCPpJNTUWFmx+IZHEtndrwkjSqaBryRbidtBGZKGpk6jmemMjMza5JaJ6r7S+COtDD4buAbZAlmnaQFZCtFFSbwug84j2zpv3dSXSKiV9LVwKOp3lUR0VtjXGZmNgQ1JYOIeAoondkRslZCad0AFlV4n1XAqlpiMTOz6vkJZDMzczIwMzMnA7NKhnmqFWsnTgZm5Y3HU61YG3EyMCvx5ptvAvwenmrF2oiTgVmJ3bt3Q7aA/T9IelLSLZKOo2SqFaBuU62YtVqtzxmYHXX6+/sBPgnc1IypVho55xbUf96tPMwh5RjqH4eTgVmJjo4OgPdKplpZTJpqJU3AONipVjpLyrtKP6+Rc25B/efdysMcUo6h/nH4NpFZiU9/+tMA73mqFWsnbhmYlfcSnmrF2oiTgVl5v4kIT7VibcO3iczMzMnAzMycDMzMDCcDMzPDycDMzHAyMDMznAzMzAwnAzMzw8nAzMyoQzKQNCxN83tv2p4oaVta2enO9Dg/kj6WtrvT/glF77Ekle+SNKvWmMzMbGjq0TK4nGwVqILlwHVpNag+YEEqXwD0RcQfAdelekg6FbgY+ALZwh83ShpWh7jMzGyQakoGkjqAPwNuSdsCziab8hcOXw2qsErUXcA5qf4cYG1EvBsRL5BN9nVmLXGZmdnQ1DpR3d8B3yFbIhDgBOD1iOhP28UrO3246lNE9Et6I9UfB2wtes+Kq0E1chGQei8AUiovC2EU5C0eyGdMZu2i6mQg6SvAvoh4XFJnobhM1TjCvkGtBgWNXQSk3guAlMrLQhgFeYsH8hmTWbuopWXwJeB8SecBHweOJ2spjJA0PLUOCis+wcHVoHokDQd+H+il8ipRZmbWJFX3GUTEkojoiIgJZB3AD0bE14EtwIWpWulqUIVVoi5M9SOVX5xGG00EJgGPVBuXmZkNXSOeM7gSuEJSN1mfwMpUvhI4IZVfQVpgPCJ2AOvIlhXcACyKiA8aEJfZkHjYtLWTuqx0FhFdpIW+I2I3ZUYDRcRvObhMYOm+ZcCyesRiVkeFYdPHp+3CsOm1kn5MNlz6JoqGTUu6ONX785Jh058BHpB0ki92LI/8BLJZecfgYdPWRpwMzMobTzZs+ndpe9DDpoHiYdN7it6z4rBps1ary20is6PJvffeC9DfrGHTjXx+Bur/DE0engdxDPWPw8nArMRDDz0E2RDpF2nCsOlGPj8D9X+GJg/PgziG+sfh20RmJa655hqApz1s2tqJWwZmg3clsFbS94EnOXTY9D+mYdO9ZAmEiNghqTBsuh8Pm7YcczIwG4CHTVu78G0iMzNzMjAzMycDMzPDycDMzHAyMDMznAzMzAwnAzMzw8nAzMxwMjAzM5wMzMwMJwMzM8PJwMzMcDIwMzNqSAaSxkvaImmnpB2SLk/loyRtkvR8+j0ylUvSCkndkp6WdHrRe81L9Z+XNK/SZ5qZWWPU0jLoB/46Ij4PTAMWSToVWAxsjohJwOa0
DXAu2eIek8iW+LsJsuQBLAWmkk0PvLSQQMzMrDmqTgYRsTcinkiv3wJ2ki32PQdYnaqtBi5Ir+cAt0VmK9kSgmOBWcCmiOiNiD5gEzC72rjMzGzo6rK4jaQJwJ8A24AxEbEXsoQhaXSqNg7YU3RYTyqrVF7ucxq2cHi9Fw0vlZfFswvyFg/kMyazdlFzMpD0KeBfgL+KiDclVaxapiwGKD+8sIELh9d70fBSeVk8uyBv8UB+YtqzZw/ASZJ2Ar8Dbo6I69MtzTuBCcCLwEUR0afspL8eOA94B7i00GpOfWD/Pb319yNiNWY5VNNoIknHkCWCOyLiJ6n4tXT7h/R7XyrvAcYXHd4BvDJAuVlLDB8+HKDH/WHWTmoZTSSyhcB3RsQPi3atBwojguYB9xSVX5JGFU0D3ki3kzYCMyWNTP9QZqYys5YYO3YsZFf47g+ztlHLbaIvAf8FeEbSU6nsvwHXAuskLQBe4uBC4feRNaO7yf6hfQMgInolXQ08mupdFRG9NcRlVjfN6A9rZF8Y1L8/LA99O46h/nFUnQwi4v9R/n4/wDll6gewqMJ7rQJWVRuLWSM0qz+skX1hUP/+sDz07TiG+sfhJ5DNyhPuD7M2UpehpUeDdf+6varjLmrwKCRrvqwRyx8Cv6jQH3Yth/eHXSZpLVln8RvpNtJG4AdFncYzgSVN+ApmQ+ZkYFbioYceAjgBONv9YdYunAzMSnz5y18GeDwippTZ7f4wOyq5z8DMzJwMzMzMycDMzHAyMDMznAzMzAwnAzMzw8nAzMxwMjAzM5wMzMwMJwMzM8PJwMzMcDIwMzM8UV3NBjv19TFv//aQup762szyxC0DMzNzMjAzMycDMzMjR30GkmYD1wPDgFsi4toWh9RQXmazfbTbuW0fTbloGUgaBvwIOBc4FZgr6dTWRmVWO5/b9lGRl5bBmUB3ROwGSAuLzwGebWlUOVRti6JU6eimStwSqVnLz223Qm0w8pIMxgF7irZ7gKmllSQtBBamzQOSdlV4vxOB/XWNsHZ5i2lQ8fx5EwIpkqc/oz+s0/sc8dzO63k9wN99Hv6eHMNBQ4mj4nmdl2SgMmVxWEHEzcDNR3wz6bEKi5m3TN5iyls8kM+Y6uCI5/ZH7bzOQxyOof5x5KLPgOxqaXzRdgfwSotiMasnn9v2kZCXZPAoMEnSREnHAhcD61sck1k9+Ny2j4Rc3CaKiH5JlwEbyYbfrYqIHTW85RGb3C2Qt5jyFg/kM6aa1PnczsufTx7icAwH1SUORRx2a97MzNpMXm4TmZlZCzkZmJnZ0ZUMJM2WtEtSt6TFOYhnlaR9kurzpFgdSBovaYuknZJ2SLo8BzF9XNIjkv4txfS9VseUN604t8udv5JGSdok6fn0e2SDYyh7vrYgjrLnaBoYsC3FcWcaJNBQkoZJelLSvfWM4ahJBjl97P9WYHaLYyjVD/x1RHwemAYsysGf07vA2RFxGvDHwGxJ01ocU2608Ny+lcPP38XA5oiYBGxO241U6XxtdhyVztHlwHUpjj5gQYPjALgc2Fm0XZcYjppkQNFj/xHxHlB47L9lIuLnQG8rYygVEXsj4on0+i2yk2pci2OKiDiQNo9JPx7ZcFBLzu0K5+8cYHV6vRq4oMExVDpfmx1HpXP0bOCuZsUhqQP4M+CWtK16xXA0JYNyj/239D+5vJM0AfgTYFtrI/mw6fsUsA/YFBEtjylH8nRuj4mIvZD9Rw2MbtYHl5yvTY+j9BwFfgW8HhH9qUoz/l7+DvgO8Lu0fUK9YjiaksGgprSwjKRPAf8C/FVEvNnqeCLig4j4Y7IndM+U5FnSDmr7czsP52vpOQp8vly1Rn2+pK8A+yLi8eLiesVwNCUDP/Y/SJKOIfuHdUdE/KTV8RSLiNeBLvLX19JKeTq3X5M0FiD93tfoD6xwvjY9joKic3QaMEJS4eHdRv+9fAk4X9KLZLcKzyZrKdQlhqMpGfix/0FI9xhXAjsj4oet
jgdA0h9IGpFefwL4U+CXrY0qV/J0bq8H5qXX84B7GvlhA5yvzY6j3Dm6E9gCXNiMOCJiSUR0RMQEsnPgwYj4et1iiIij5gc4D3iO7F7ed3MQzxpgL/A+2dXdghzE9GWyZuTTwFPp57wWx/QfgSdTTNuB/9HqP6e8/bTi3C53/pLdo94MPJ9+j2pwDGXP1xbEUfYcBT4HPAJ0A/8MfKxJfzedwL31jMHTUZiZ2VF1m8jMzKrkZGBmZk4GZmbmZGBmZjgZmJkZTgZmZoaTgZmZAf8fRtAJVq1M7z0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Side-by-side comparison: log-transformed NA_Sales (left) vs. raw NA_Sales (right).\n",
    "# NOTE(review): a log transform normally reduces right skew, so the left panel is expected\n",
    "# to be the less skewed one - confirm against the rendered figure.\n",
    "# Both axes are created in a single call and passed to hist() explicitly, so no unused\n",
    "# full-figure axes is left behind the two panels.\n",
    "fig, (ax_log, ax_raw) = plt.subplots(1, 2)\n",
    "df_log['NA_Sales_log'].hist(color='#A9C5D3', ax=ax_log)\n",
    "ax_log.set_title('log(1 + NA_Sales)')\n",
    "\n",
    "df_log['NA_Sales'].hist(color='#A9C5D3', ax=ax_raw)\n",
    "ax_raw.set_title('NA_Sales');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "上面是手动实现的对数变换；另外还有现成的 Box-Cox 变换（如 scipy.stats.boxcox），这里暂不做示例"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 日期相关特征\n",
    "将时间特征转换成可以应用的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "from dateutil.parser import parse\n",
    "import pytz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-12-16 10:30:00.360000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-04-16 12:15:00.250000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-10-16 08:30:00.750000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2019-01-16 23:30:00.255500+00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               Time\n",
       "0  2020-12-16 10:30:00.360000+00:00\n",
       "1  2019-04-16 12:15:00.250000+00:00\n",
       "2  2018-10-16 08:30:00.750000+00:00\n",
       "3  2019-01-16 23:30:00.255500+00:00"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sample timestamps as strings, each carrying an explicit +00:00 offset.\n",
    "time_stamps = [\n",
    "    '2020-12-16 10:30:00.360000+00:00',\n",
    "    '2019-04-16 12:15:00.250000+00:00',\n",
    "    '2018-10-16 08:30:00.750000+00:00',\n",
    "    '2019-01-16 23:30:00.255500+00:00',\n",
    "]\n",
    "\n",
    "# One-column frame (Time) holding the raw strings.\n",
    "df = pd.DataFrame({'Time': time_stamps})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([Timestamp('2020-12-16 10:30:00.360000+0000', tz='UTC'),\n",
       "       Timestamp('2019-04-16 12:15:00.250000+0000', tz='UTC'),\n",
       "       Timestamp('2018-10-16 08:30:00.750000+0000', tz='UTC'),\n",
       "       Timestamp('2019-01-16 23:30:00.255500+0000', tz='UTC')],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parse each raw string of the Time column into a pandas Timestamp; the results are\n",
    "# collected in a NumPy array and also stored on the frame as column TS_obj.\n",
    "ts_objs = np.array(list(map(pd.Timestamp, df['Time'])))\n",
    "df['TS_obj'] = ts_objs\n",
    "ts_objs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Time</th>\n",
       "      <th>Year</th>\n",
       "      <th>Month</th>\n",
       "      <th>day</th>\n",
       "      <th>DayOfWeek</th>\n",
       "      <th>WeekDayName</th>\n",
       "      <th>DayOfYear</th>\n",
       "      <th>WeekOfYear</th>\n",
       "      <th>Quarter</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-12-16 10:30:00.360000+00:00</td>\n",
       "      <td>2020</td>\n",
       "      <td>12</td>\n",
       "      <td>16</td>\n",
       "      <td>2</td>\n",
       "      <td>Wednesday</td>\n",
       "      <td>351</td>\n",
       "      <td>51</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-04-16 12:15:00.250000+00:00</td>\n",
       "      <td>2019</td>\n",
       "      <td>4</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>Tuesday</td>\n",
       "      <td>106</td>\n",
       "      <td>16</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-10-16 08:30:00.750000+00:00</td>\n",
       "      <td>2018</td>\n",
       "      <td>10</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>Tuesday</td>\n",
       "      <td>289</td>\n",
       "      <td>42</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2019-01-16 23:30:00.255500+00:00</td>\n",
       "      <td>2019</td>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>2</td>\n",
       "      <td>Wednesday</td>\n",
       "      <td>16</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               Time  Year  Month  day  DayOfWeek WeekDayName  \\\n",
       "0  2020-12-16 10:30:00.360000+00:00  2020     12   16          2   Wednesday   \n",
       "1  2019-04-16 12:15:00.250000+00:00  2019      4   16          1     Tuesday   \n",
       "2  2018-10-16 08:30:00.750000+00:00  2018     10   16          1     Tuesday   \n",
       "3  2019-01-16 23:30:00.255500+00:00  2019      1   16          2   Wednesday   \n",
       "\n",
       "   DayOfYear  WeekOfYear  Quarter  \n",
       "0        351          51        4  \n",
       "1        106          16        2  \n",
       "2        289          42        4  \n",
       "3         16           3        1  "
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calendar features derived from each parsed Timestamp in TS_obj.\n",
    "df['Year'] = df['TS_obj'].apply(lambda ts: ts.year)\n",
    "df['Month'] = df['TS_obj'].apply(lambda ts: ts.month)\n",
    "df['Day'] = df['TS_obj'].apply(lambda ts: ts.day)\n",
    "df['DayOfWeek'] = df['TS_obj'].apply(lambda ts: ts.dayofweek)  # Monday is 0\n",
    "# day_name() is used instead of the weekday_name attribute, which pandas 1.0 removed.\n",
    "df['WeekDayName'] = df['TS_obj'].apply(lambda ts: ts.day_name())\n",
    "df['DayOfYear'] = df['TS_obj'].apply(lambda ts: ts.dayofyear)\n",
    "df['WeekOfYear'] = df['TS_obj'].apply(lambda ts: ts.weekofyear)\n",
    "df['Quarter'] = df['TS_obj'].apply(lambda ts: ts.quarter)\n",
    "\n",
    "# Select the column under the name it is created with above ('Day'); asking for 'day'\n",
    "# raises KeyError on a fresh kernel.\n",
    "df[['Time', 'Year', 'Month', 'Day', 'DayOfWeek', 'WeekDayName', 'DayOfYear', 'WeekOfYear', 'Quarter']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这样就能从时间数据中提取出很多特征；不同场景关注的特征不同，例如外卖业务会更关注周末和季节等。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Time</th>\n",
       "      <th>Hour</th>\n",
       "      <th>Minute</th>\n",
       "      <th>Second</th>\n",
       "      <th>Microsecond</th>\n",
       "      <th>Utcoffset</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-12-16 10:30:00.360000+00:00</td>\n",
       "      <td>10</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "      <td>360000</td>\n",
       "      <td>0 days</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-04-16 12:15:00.250000+00:00</td>\n",
       "      <td>12</td>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>250000</td>\n",
       "      <td>0 days</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-10-16 08:30:00.750000+00:00</td>\n",
       "      <td>8</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "      <td>750000</td>\n",
       "      <td>0 days</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2019-01-16 23:30:00.255500+00:00</td>\n",
       "      <td>23</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "      <td>255500</td>\n",
       "      <td>0 days</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               Time  Hour  Minute  Second  Microsecond  \\\n",
       "0  2020-12-16 10:30:00.360000+00:00    10      30       0       360000   \n",
       "1  2019-04-16 12:15:00.250000+00:00    12      15       0       250000   \n",
       "2  2018-10-16 08:30:00.750000+00:00     8      30       0       750000   \n",
       "3  2019-01-16 23:30:00.255500+00:00    23      30       0       255500   \n",
       "\n",
       "  Utcoffset  \n",
       "0    0 days  \n",
       "1    0 days  \n",
       "2    0 days  \n",
       "3    0 days  "
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Clock-level features: each new column is read from the same-named attribute of the timestamp.\n",
    "clock_fields = {'Hour': 'hour', 'Minute': 'minute', 'Second': 'second', 'Microsecond': 'microsecond'}\n",
    "for column, attribute in clock_fields.items():\n",
    "    # Bind the attribute name as a default argument so each lambda keeps its own value.\n",
    "    df[column] = df['TS_obj'].apply(lambda ts, name=attribute: getattr(ts, name))\n",
    "\n",
    "# Offset from UTC, as returned by each timestamp's utcoffset() method.\n",
    "df['Utcoffset'] = df['TS_obj'].apply(lambda ts: ts.utcoffset())\n",
    "\n",
    "df[['Time','Hour','Minute','Second','Microsecond','Utcoffset']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "又比如，可以按一天中的时段对小时进行分箱："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Time</th>\n",
       "      <th>Hour</th>\n",
       "      <th>TimeOfDayBin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-12-16 10:30:00.360000+00:00</td>\n",
       "      <td>10</td>\n",
       "      <td>Morning</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-04-16 12:15:00.250000+00:00</td>\n",
       "      <td>12</td>\n",
       "      <td>Afternoon</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-10-16 08:30:00.750000+00:00</td>\n",
       "      <td>8</td>\n",
       "      <td>Morning</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2019-01-16 23:30:00.255500+00:00</td>\n",
       "      <td>23</td>\n",
       "      <td>Night</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               Time  Hour TimeOfDayBin\n",
       "0  2020-12-16 10:30:00.360000+00:00    10      Morning\n",
       "1  2019-04-16 12:15:00.250000+00:00    12    Afternoon\n",
       "2  2018-10-16 08:30:00.750000+00:00     8      Morning\n",
       "3  2019-01-16 23:30:00.255500+00:00    23        Night"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Each pair is (last hour included in the bin, label); the first bin opens at -1 so hour 0 is included.\n",
    "time_of_day_edges = [\n",
    "    (5, 'Late Night'),\n",
    "    (11, 'Morning'),\n",
    "    (16, 'Afternoon'),\n",
    "    (21, 'Evening'),\n",
    "    (23, 'Night'),\n",
    "]\n",
    "hour_bins = [-1] + [upper for upper, _ in time_of_day_edges]\n",
    "bin_names = [label for _, label in time_of_day_edges]\n",
    "\n",
    "# right=True (the pd.cut default) makes every interval closed on its upper edge.\n",
    "df['TimeOfDayBin'] = pd.cut(x=df['Hour'], bins=hour_bins, right=True, labels=bin_names)\n",
    "\n",
    "df[['Time','Hour','TimeOfDayBin']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
